library(EnrichmentBrowser)
library(GSEABenchmarkeR)
cb.pink <- "#CC79A7"
cb.darkred <- "#B42F32"
cb.red <- "#D55E00"
cb.lightred <- "#DF6747"
cb.blue <- "#0072B2"
cb.yellow <- "#F0E442"
cb.green <- "#009E73"
cb.lightblue <- "#56B4E9"
cb.lightorange <- "#FAAC77"
cb.orange <- "#E69F00"
cb.darkorange <- "#F6893D"
cb.lightgrey <- "#C9C9BD"
cb.darkgrey <- "#878D92"
data.dir <- rappdirs::user_data_dir("GSEABenchmarkeR")
ma.dir <- file.path(data.dir, "GEO2KEGG_preproc")
geo2kegg <- loadEData(ma.dir)
##
|
| | 0%
|
|== | 2%
|
|=== | 5%
|
|===== | 7%
|
|====== | 10%
|
|======== | 12%
|
|========== | 14%
|
|=========== | 17%
|
|============= | 19%
|
|============== | 21%
|
|================ | 24%
|
|================== | 26%
|
|=================== | 29%
|
|===================== | 31%
|
|====================== | 33%
|
|======================== | 36%
|
|========================== | 38%
|
|=========================== | 40%
|
|============================= | 43%
|
|============================== | 45%
|
|================================ | 48%
|
|================================== | 50%
|
|=================================== | 52%
|
|===================================== | 55%
|
|====================================== | 57%
|
|======================================== | 60%
|
|========================================= | 62%
|
|=========================================== | 64%
|
|============================================= | 67%
|
|============================================== | 69%
|
|================================================ | 71%
|
|================================================= | 74%
|
|=================================================== | 76%
|
|===================================================== | 79%
|
|====================================================== | 81%
|
|======================================================== | 83%
|
|========================================================= | 86%
|
|=========================================================== | 88%
|
|============================================================= | 90%
|
|============================================================== | 93%
|
|================================================================ | 95%
|
|================================================================= | 98%
|
|===================================================================| 100%
ma.ids <- names(geo2kegg)
ma.ids
## [1] "GSE1145" "GSE11906" "GSE1297"
## [4] "GSE14762" "GSE14924_CD4" "GSE14924_CD8"
## [7] "GSE15471" "GSE16515" "GSE16759"
## [10] "GSE18842" "GSE19188" "GSE19420"
## [13] "GSE19728" "GSE20153" "GSE20164"
## [16] "GSE20291" "GSE21354" "GSE22780"
## [19] "GSE23878" "GSE24739_G0" "GSE24739_G1"
## [22] "GSE30153" "GSE32676" "GSE3467"
## [25] "GSE3585" "GSE3678" "GSE38666_epithelia"
## [28] "GSE38666_stroma" "GSE4107" "GSE4183"
## [31] "GSE42057" "GSE5281_EC" "GSE5281_HIP"
## [34] "GSE5281_VCX" "GSE6956AA" "GSE6956C"
## [37] "GSE7305" "GSE781" "GSE8671"
## [40] "GSE8762" "GSE9348" "GSE9476"
rseq.dir <- file.path(data.dir, "TCGA_preproc")
rseq.raw <- file.path(rseq.dir, "GSE62944_matched_limmavoom", "tcga_paired_voomlimma.rds")
rseq.vst <- file.path(rseq.dir, "GSE62944_matched_vst", "tcga_paired_vst_limma.rds")
rseq.tpm <- file.path(rseq.dir, "cTD_matched_tpm", "cTD_matched_tpm.rds")
tcga.raw <- readRDS(rseq.raw)
tcga.vst <- readRDS(rseq.vst)
tcga.tpm <- readRDS(rseq.tpm)
rseq.ids <- names(tcga.raw)
rseq.ids
## [1] "BLCA" "BRCA" "COAD" "HNSC" "KICH" "KIRC" "KIRP" "LIHC" "LUAD" "LUSC"
## [11] "PRAD" "READ" "STAD" "THCA" "UCEC"
geo2kegg <- runDE(geo2kegg, padj.method="BH")
bh.padj <- function(se)
{
rowData(se)$ADJ.PVAL <- p.adjust(rowData(se)$PVAL, method="BH")
return(se)
}
tcga.raw <- lapply(tcga.raw, bh.padj)
tcga.vst <- lapply(tcga.vst, bh.padj)
plotDEDistribution(geo2kegg)
plotDEDistribution(tcga.raw)
plotDEDistribution(tcga.vst)
Comparison of DE results for different RNA-seq approaches: - raw read counts + voom/limma - vst counts + limma - log2 TPMs + limma
# VST
cors.vst <- vapply(rseq.ids,
function(id)
cor(rowData(tcga.raw[[id]])$FC,
rowData(tcga.vst[[id]])$FC),
numeric(1))
sort(round(cors.vst, digits=3))
## LIHC BLCA STAD KIRP HNSC UCEC THCA KICH BRCA PRAD KIRC READ
## 0.964 0.971 0.975 0.979 0.982 0.985 0.987 0.990 0.991 0.991 0.992 0.992
## LUAD LUSC COAD
## 0.993 0.993 0.995
corsp.vst <- vapply(rseq.ids,
function(id)
cor(-log(rowData(tcga.raw[[id]])$PVAL, base=10),
-log(rowData(tcga.vst[[id]])$PVAL, base=10)),
numeric(1))
sort(round(corsp.vst, digits=3))
## KIRP LIHC BLCA KICH THCA UCEC STAD HNSC READ KIRC LUSC BRCA
## 0.964 0.970 0.976 0.976 0.979 0.980 0.982 0.983 0.984 0.985 0.986 0.987
## COAD LUAD PRAD
## 0.989 0.989 0.992
# TPM
cors.tpm <- vapply(names(tcga.tpm),
function(id)
cor(rowData(tcga.tpm[[id]])$FC,
rowData(tcga.raw[[id]][names(tcga.tpm[[id]]),])$FC),
numeric(1))
sort(round(cors.tpm, digits=3))
## PRAD STAD BLCA LUAD THCA KIRP LIHC BRCA KICH KIRC LUSC HNSC
## 0.992 0.992 0.993 0.993 0.994 0.995 0.995 0.996 0.996 0.997 0.997 0.998
corsp.tpm <- vapply(names(tcga.tpm),
function(id)
cor(-log(rowData(tcga.tpm[[id]])$PVAL, base=10),
-log(rowData(tcga.raw[[id]][names(tcga.tpm[[id]]),])$PVAL, base=10)),
numeric(1))
sort(round(corsp.tpm, digits=3))
## PRAD LUAD KIRP STAD BLCA THCA LIHC BRCA KICH KIRC HNSC LUSC
## 0.959 0.971 0.975 0.978 0.980 0.984 0.985 0.986 0.986 0.989 0.992 0.992
# log TPM
tcga.logtpm <- lapply(tcga.tpm,
function(se)
{
assay(se) <- log(assay(se) + 1, base=2)
return(se)
})
tcga.logtpm <- runDE(tcga.logtpm)
cors.logtpm <- vapply(names(tcga.logtpm),
function(id)
cor(rowData(tcga.logtpm[[id]])$FC,
rowData(tcga.raw[[id]][names(tcga.logtpm[[id]]),])$FC),
numeric(1))
sort(round(cors.logtpm, digits=3))
## LIHC BLCA KIRP STAD THCA HNSC KICH LUAD PRAD BRCA KIRC LUSC
## 0.956 0.966 0.967 0.967 0.980 0.982 0.983 0.985 0.985 0.987 0.988 0.989
corsp.logtpm <- vapply(names(tcga.logtpm),
function(id)
cor(-log(rowData(tcga.logtpm[[id]])$PVAL, base=10),
-log(rowData(tcga.raw[[id]][names(tcga.logtpm[[id]]),])$PVAL, base=10)),
numeric(1))
sort(round(corsp.logtpm, digits=3))
## KICH KIRP THCA PRAD STAD BRCA BLCA LUAD KIRC LIHC LUSC HNSC
## 0.875 0.924 0.929 0.948 0.951 0.960 0.963 0.963 0.964 0.964 0.975 0.978
# overall
fc.mat <- data.frame(cors.vst, cors.tpm[rseq.ids], cors.logtpm[rseq.ids])
round(fc.mat, digits=3)
## cors.vst cors.tpm.rseq.ids. cors.logtpm.rseq.ids.
## BLCA 0.971 0.993 0.966
## BRCA 0.991 0.996 0.987
## COAD 0.995 NA NA
## HNSC 0.982 0.998 0.982
## KICH 0.990 0.996 0.983
## KIRC 0.992 0.997 0.988
## KIRP 0.979 0.995 0.967
## LIHC 0.964 0.995 0.956
## LUAD 0.993 0.993 0.985
## LUSC 0.993 0.997 0.989
## PRAD 0.991 0.992 0.985
## READ 0.992 NA NA
## STAD 0.975 0.992 0.967
## THCA 0.987 0.994 0.980
## UCEC 0.985 NA NA
p.mat <- data.frame(corsp.vst, corsp.tpm[rseq.ids], corsp.logtpm[rseq.ids])
round(p.mat, digits=3)
## corsp.vst corsp.tpm.rseq.ids. corsp.logtpm.rseq.ids.
## BLCA 0.976 0.980 0.963
## BRCA 0.987 0.986 0.960
## COAD 0.989 NA NA
## HNSC 0.983 0.992 0.978
## KICH 0.976 0.986 0.875
## KIRC 0.985 0.989 0.964
## KIRP 0.964 0.975 0.924
## LIHC 0.970 0.985 0.964
## LUAD 0.989 0.971 0.963
## LUSC 0.986 0.992 0.975
## PRAD 0.992 0.959 0.948
## READ 0.984 NA NA
## STAD 0.982 0.978 0.951
## THCA 0.979 0.984 0.929
## UCEC 0.980 NA NA
Frequently used data set, one of the first to use microarray data in a classification context. Golub et al. (1999)
library(golubEsets)
library(vsn)
data(Golub_Train)
exprs(Golub_Train) <- exprs(vsn2(Golub_Train))
## vsn2: 7129 x 38 matrix (1 stratum).
## Please use 'meanSdPlot' to verify the fit.
se <- probe2gene(Golub_Train)
## Loading required package: hu6800.db
## Loading required package: AnnotationDbi
## Loading required package: org.Hs.eg.db
##
##
## Encountered 360 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
se$GROUP <- ifelse(se$ALL.AML == "ALL", 1, 0)
se <- deAna(se)
se
## class: SummarizedExperiment
## dim: 5523 38
## metadata(3): experimentData annotation protocolData
## assays(1): exprs
## rownames(5523): 6772 2597 ... 3397 7277
## rowData names(4): FC limma.STAT PVAL ADJ.PVAL
## colnames(38): 1 2 ... 32 33
## colData names(12): Samples ALL.AML ... Source GROUP
kegg.gs <- getGenesets(org="hsa", db="kegg")
length(kegg.gs)
## [1] 330
(min.size <- EnrichmentBrowser::configEBrowser("GS.MIN.SIZE"))
## [1] 5
(max.size <- EnrichmentBrowser::configEBrowser("GS.MAX.SIZE"))
## [1] 500
ind <- (lengths(kegg.gs) >= min.size) & (lengths(kegg.gs) <= max.size)
kegg.gs <- kegg.gs[ind]
length(kegg.gs)
## [1] 324
MASS::truehist(lengths(kegg.gs), nbins=50)
go.gs <- getGenesets(org="hsa", db="go", go.onto="BP")
##
length(go.gs)
## [1] 12078
ind <- (lengths(go.gs) >= min.size) & (lengths(go.gs) <= max.size)
go.gs <- go.gs[ind]
length(go.gs)
## [1] 4705
MASS::truehist(lengths(go.gs), nbins=50)
res.dir <- file.path(data.dir, "results")
geo.dir <- file.path(res.dir, "GEO2KEGG")
ea.methods <- sbeaMethods()[1:10]
kegg.dir <- file.path(geo.dir, "kegg/perm1k")
ma.kegg.rtimes <- readResults(kegg.dir, ma.ids,
methods=ea.methods, type="runtime")
lengths(ma.kegg.rtimes)
## ora safe gsea gsa padog globaltest roast
## 42 42 42 42 42 42 42
## camera gsva samgs
## 42 42 42
ma.kegg.rtimes[1:2]
## $ora
## GSE1145 GSE11906 GSE1297 GSE14762
## 0.360 0.498 0.367 0.459
## GSE14924_CD4 GSE14924_CD8 GSE15471 GSE16515
## 0.309 0.466 0.469 0.424
## GSE16759 GSE18842 GSE19188 GSE19420
## 0.283 0.483 0.339 0.421
## GSE19728 GSE20153 GSE20164 GSE20291
## 0.458 0.430 0.229 0.377
## GSE21354 GSE22780 GSE23878 GSE24739_G0
## 0.286 0.433 0.445 0.271
## GSE24739_G1 GSE30153 GSE32676 GSE3467
## 0.262 0.346 0.288 0.426
## GSE3585 GSE3678 GSE38666_epithelia GSE38666_stroma
## 0.353 0.283 0.464 0.433
## GSE4107 GSE4183 GSE42057 GSE5281_EC
## 0.278 0.446 0.212 0.291
## GSE5281_HIP GSE5281_VCX GSE6956AA GSE6956C
## 0.441 0.356 0.225 0.222
## GSE7305 GSE781 GSE8671 GSE8762
## 0.247 0.226 0.438 0.248
## GSE9348 GSE9476
## 0.310 0.199
##
## $safe
## GSE1145 GSE11906 GSE1297 GSE14762
## 11.480 24.782 6.050 10.399
## GSE14924_CD4 GSE14924_CD8 GSE15471 GSE16515
## 8.420 9.809 24.841 13.115
## GSE16759 GSE18842 GSE19188 GSE19420
## 0.837 29.785 41.487 12.182
## GSE19728 GSE20153 GSE20164 GSE20291
## 10.359 8.430 3.225 10.748
## GSE21354 GSE22780 GSE23878 GSE24739_G0
## 7.380 8.942 12.836 3.503
## GSE24739_G1 GSE30153 GSE32676 GSE3467
## 4.290 7.279 11.517 9.048
## GSE3585 GSE3678 GSE38666_epithelia GSE38666_stroma
## 3.787 7.010 11.406 7.631
## GSE4107 GSE4183 GSE42057 GSE5281_EC
## 6.109 10.940 16.118 6.303
## GSE5281_HIP GSE5281_VCX GSE6956AA GSE6956C
## 7.072 9.420 1.077 5.570
## GSE7305 GSE781 GSE8671 GSE8762
## 5.433 5.184 18.191 5.951
## GSE9348 GSE9476
## 16.891 15.657
go.dir <- file.path(geo.dir, "go_bp")
ma.go.rtimes <- readResults(go.dir, ma.ids,
methods=ea.methods, type="runtime")
lengths(ma.go.rtimes)
## ora safe gsea gsa padog globaltest roast
## 42 42 42 42 42 42 42
## camera gsva samgs
## 42 42 42
bpPlot(ma.kegg.rtimes, what="runtime")
bpPlot(ma.go.rtimes, what="runtime")
Using ggpubr / ggplot2:
library(ggpubr)
plotRuntime2 <- function(rtimes)
{
x <- do.call(cbind, rtimes)
df <- reshape2::melt(x)
df$value <- log(df$value, base=10)
df$Var2 <- substring(df$Var2, 1, 7)
medians <- vapply(rtimes, median, numeric(1))
o <- names(sort(medians))
o <- substring(o, 1, 7)
p <- ggboxplot(df, x = "Var2", y = "value", width = 0.8,
ylab="log10 runtime [sec]", xlab="", order=o, fill="Var2")
p <- ggpar(p, x.text.angle=45, palette = "simpsons", legend="none") +
grids("y", linetype = "dashed")
return(p)
}
p <- plotRuntime2(ma.kegg.rtimes)
p + geom_label(x=10, y=0, label="1 sec", col="grey") +
geom_label(x=1.5, y=1, label="10 sec", col="grey") +
geom_label(x=1.5, y=2, label="1 min 40 sec", col="grey")
p <- plotRuntime2(ma.go.rtimes)
p + geom_label(x=10, y=1, label="10 sec", col="grey") +
geom_label(x=1.5, y=2, label="1 min 40 sec", col="grey") +
geom_label(x=1.5, y=3, label="15 min 40 sec", col="grey")
tcga.dir <- file.path(res.dir, "TCGA")
vst.dir <- file.path(tcga.dir, "GSE62944_matched_vst")
rseq.kegg.dir <- file.path(vst.dir, "kegg")
rseq.go.dir <- file.path(vst.dir, "gobp")
rseq.kegg.rtimes <- readResults(rseq.kegg.dir, rseq.ids,
methods=ea.methods, type="runtime")
rseq.go.rtimes <- readResults(rseq.go.dir, rseq.ids,
methods=ea.methods, type="runtime")
bpPlot(rseq.kegg.rtimes, what="runtime")
bpPlot(rseq.go.rtimes, what="runtime")
facetplot <- function(ma.kegg, ma.go, rseq.kegg, rseq.go,
ylab="% significant sets", vline=6.5, log=FALSE)
{
l <- list(ma.kegg=ma.kegg, ma.go=ma.go, rseq.kegg=rseq.kegg, rseq.go=rseq.go)
df <- reshape2::melt(l)
gsc <- vapply(df$L1, function(x) unlist(strsplit(x,"\\."))[2],
character(1), USE.NAMES=FALSE)
df <- cbind(df, gsc=gsc)
df$gsc <- toupper(df$gsc)
df$gsc <- vapply(df$gsc, function(n)
ifelse(n == "GO", paste(n, "BP", sep="-"), n),
character(1), USE.NAMES=FALSE)
df$gsc <- factor(df$gsc, levels=c("KEGG", "GO-BP"))
colnames(df)[1:2] <- c("dataset", "method")
colnames(df)[4] <- "compendium"
df$compendium <- sub("ma.kegg", "GEO2KEGG microarray", df$compendium)
df$compendium <- sub("rseq.go", "TCGA RNA-seq", df$compendium)
df$compendium <- sub("rseq.kegg", "TCGA RNA-seq", df$compendium)
df$compendium <- sub("ma.go", "GEO2KEGG microarray", df$compendium)
df$method <- substring(df$method, 1, 7)
if(log) df$value <- log(df$value, base=10)
o <- sort(vapply(split(df$value, df$method),
median, numeric(1), na.rm=TRUE))
df$method <- factor(df$method, levels=names(o))
p <- ggboxplot(df, x = "method", y = "value",
width = 0.8, ylab=ylab, xlab="", fill="method")
p <- ggpar(p, x.text.angle=45, palette = "simpsons", legend="none")
if(!is.na(vline))
p <- p + geom_vline(xintercept=vline, linetype="dashed", color = cb.darkgrey)
p <- facet(p, facet.by=c("compendium", "gsc"))
return(p)
}
p <- facetplot(do.call(cbind, ma.kegg.rtimes),
do.call(cbind, ma.go.rtimes),
do.call(cbind, rseq.kegg.rtimes),
do.call(cbind, rseq.go.rtimes),
ylab="log10 runtime [sec]",
vline=NA, log=TRUE)
p + grids("y", linetype = "dashed") +
geom_label(x=9, y=0, label="1 sec", col="grey", size=2.5) +
geom_label(x=9, y=1, label="10 sec", col="grey", size=2.5) +
geom_label(x=2, y=2, label="1 min 40 sec", col="grey", size=2.5) +
geom_label(x=2, y=3, label="15 min 40 sec", col="grey", size=2.5)
Checking different RNA-seq modes (raw vs. vst):
raw.dir <- file.path(tcga.dir, "GSE62944_matched_limmavoom")
raw.kegg.dir <- file.path(raw.dir, "kegg")
raw.kegg.rtimes <- readResults(raw.kegg.dir, rseq.ids,
methods=list.files(raw.kegg.dir), type="runtime")
rtimes <- c(raw.kegg.rtimes[c("camera", "gsva", "roast")],
rseq.kegg.rtimes[c(c("camera", "gsva", "roast"))])
names(rtimes) <- paste(names(rtimes), c(rep("raw",3), rep("vst", 3)), sep=".")
rtimes <- rtimes[c(4,1,6,3,5,2)]
rtimes <- lapply(rtimes, log, base = 10)
ylab <- "log10 runtime [sec]"
par(las = 2)
boxplot(rtimes, col = rep(rainbow(3), each=2), ylab = ylab)
safe.kegg.rtimes <- list(vst=rseq.kegg.rtimes$safe, raw=raw.kegg.rtimes$safe)
bpPlot(safe.kegg.rtimes, what="runtime")
Median runtime:
Three groups:
ma.kegg.ranks <- readResults(kegg.dir, ma.ids,
methods=ea.methods, type="ranking")
ma.go.ranks <- readResults(go.dir, ma.ids,
methods=ea.methods, type="ranking")
lengths(ma.kegg.ranks)
## ora safe gsea gsa padog globaltest roast
## 42 42 42 42 42 42 42
## camera gsva samgs
## 42 42 42
ma.kegg.ranks$ora[1:2]
## $GSE1145
## DataFrame with 292 rows and 4 columns
## GENE.SET NR.GENES
## <character> <numeric>
## 1 hsa05010_Alzheimer's_disease 164
## 2 hsa03018_RNA_degradation 76
## 3 hsa05016_Huntington's_disease 187
## 4 hsa04071_Sphingolipid_signaling_pathway 118
## 5 hsa05142_Chagas_disease_(American_trypanosomiasis) 102
## ... ... ...
## 288 hsa02010_ABC_transporters 43
## 289 hsa05033_Nicotine_addiction 40
## 290 hsa04080_Neuroactive_ligand-receptor_interaction 274
## 291 hsa00430_Taurine_and_hypotaurine_metabolism 11
## 292 hsa00524_Butirosin_and_neomycin_biosynthesis 5
## NR.SIG.GENES PVAL
## <numeric> <numeric>
## 1 57 0.000754
## 2 30 0.00146
## 3 62 0.00176
## 4 41 0.00397
## 5 36 0.00503
## ... ... ...
## 288 4 0.996
## 289 3 0.998
## 290 43 1
## 291 0 1
## 292 0 1
##
## $GSE11906
## DataFrame with 292 rows and 4 columns
## GENE.SET NR.GENES
## <character> <numeric>
## 1 hsa04380_Osteoclast_differentiation 130
## 2 hsa00051_Fructose_and_mannose_metabolism 32
## 3 hsa04668_TNF_signaling_pathway 110
## 4 hsa00565_Ether_lipid_metabolism 42
## 5 hsa04917_Prolactin_signaling_pathway 72
## ... ... ...
## 288 hsa00460_Cyanoamino_acid_metabolism 7
## 289 hsa00750_Vitamin_B6_metabolism 6
## 290 hsa00232_Caffeine_metabolism 5
## 291 hsa00400_Phenylalanine,_tyrosine_and_tryptophan_biosynthesis 5
## 292 hsa00524_Butirosin_and_neomycin_biosynthesis 5
## NR.SIG.GENES PVAL
## <numeric> <numeric>
## 1 8 0.00264
## 2 4 0.00266
## 3 7 0.00404
## 4 4 0.00718
## 5 5 0.0103
## ... ... ...
## 288 0 1
## 289 0 1
## 290 0 1
## 291 0 1
## 292 0 1
ma.kegg.sig.sets <- evalNrSigSets(ma.kegg.ranks, alpha=0.05, padj="none")
bpPlot(ma.kegg.sig.sets, what="sig.sets")
ma.go.sig.sets <- evalNrSigSets(ma.go.ranks, alpha=0.05, padj="none")
bpPlot(ma.go.sig.sets, what="sig.sets")
rseq.kegg.ranks <- readResults(rseq.kegg.dir, rseq.ids,
methods=ea.methods, type="ranking")
rseq.kegg.sig.sets <- evalNrSigSets(rseq.kegg.ranks, alpha=0.05, padj="none")
rseq.go.ranks <- readResults(rseq.go.dir, rseq.ids,
methods=ea.methods, type="ranking")
rseq.go.sig.sets <- evalNrSigSets(rseq.go.ranks, alpha=0.05, padj="none")
facetplot(ma.kegg.sig.sets, ma.go.sig.sets, rseq.kegg.sig.sets, rseq.go.sig.sets)
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
golub.dir <- file.path(res.dir, "golub")
random.dir <- file.path(golub.dir, "randomGS", "true_labels")
gs.grid <- c(5, 10, 25, 50, 100, 250, 500)
gs.grid <- paste0("gs", gs.grid)
ea.random <- readResults(random.dir, gs.grid,
methods=ea.methods, type="ranking")
sig.sets <- evalNrSigSets(ea.random, alpha=0.05, padj="none")
ind <- order( apply(sig.sets, 2, median) )
sig.sets <- sig.sets[, ind]
round(sig.sets, digits=1)
## ora camera safe padog gsea gsa gsva roast globaltest samgs
## gs5 5.0 5 5.0 5 7.0 7.0 21.0 24.0 43 54
## gs10 2.0 5 2.0 4 8.0 7.0 26.0 32.0 63 78
## gs25 3.0 3 4.0 5 8.0 3.0 25.0 24.0 94 98
## gs50 3.0 5 5.0 5 10.0 9.0 29.0 33.0 98 100
## gs100 5.0 1 7.0 3 5.0 3.0 23.0 18.0 100 100
## gs250 1.0 0 8.0 6 3.0 7.0 23.0 14.0 100 100
## gs500 3.8 0 7.7 2 1.9 3.6 17.3 11.5 100 100
GSEABenchmarkeR:::plotGSSizeRobustness(sig.sets, what="sig.sets")
df <- reshape2::melt(sig.sets)
colnames(df) <- c("Size", "Method", "value")
df$Method <- substring(df$Method, 1, 7)
df$Method <- factor(df$Method, levels=substring(rev(colnames(sig.sets)),1,7))
df$Size <- sub("^gs", "", df$Size)
col <- rev(get_palette(palette = "simpsons", 10))
p <- ggline(df, x = "Size", y = "value", linetype = "Method", color="Method",
palette = "simpsons", ylab="% significant sets", xlab="gene set size") +
geom_hline(yintercept=5, linetype="dashed", color = cb.darkgrey)
p
ma.kegg.sig.sets <- evalNrSigSets(ma.kegg.ranks, alpha=0.05, padj="BH")
bpPlot(ma.kegg.sig.sets, what="sig.sets")
ma.go.sig.sets <- evalNrSigSets(ma.go.ranks, alpha=0.05, padj="BH")
bpPlot(ma.go.sig.sets, what="sig.sets")
rseq.kegg.sig.sets <- evalNrSigSets(rseq.kegg.ranks, alpha=0.05, padj="BH")
rseq.go.sig.sets <- evalNrSigSets(rseq.go.ranks, alpha=0.05, padj="BH")
facetplot(ma.kegg.sig.sets, ma.go.sig.sets, rseq.kegg.sig.sets, rseq.go.sig.sets)
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
fdr.dir <- file.path(kegg.dir, "builtInFDR")
fdr.methods <- c("gsea", "safe")
ea.kegg.ranks.fdr <- readResults(fdr.dir, ma.ids,
methods=fdr.methods, type="ranking")
sig.sets <- evalNrSigSets(ea.kegg.ranks.fdr)
bpPlot(sig.sets, what="sig.sets")
kegg.dir <- sub("1k$", "10k", kegg.dir)
perm.methods <- c("gsa", "gsea", "padog", "roast", "safe", "samgs")
ea.kegg.ranks.10k <- readResults(kegg.dir, ma.ids,
methods=perm.methods, type="ranking")
sig.sets <- evalNrSigSets(ea.kegg.ranks.10k, alpha=0.05, padj="BH")
bpPlot(sig.sets, what="sig.sets")
fdr.dir <- file.path(kegg.dir, "builtInFDR")
ea.kegg.ranks.fdr.10k <- readResults(fdr.dir, ma.ids,
methods=fdr.methods, type="ranking")
sig.sets <- evalNrSigSets(ea.kegg.ranks.fdr.10k)
bpPlot(sig.sets, what="sig.sets")
ma.de <- vapply(geo2kegg, function(se) GSEABenchmarkeR:::.fractDE(se), numeric(2))
ma.de <- ma.de["p",]
rseq.de <- vapply(tcga.raw, function(se) GSEABenchmarkeR:::.fractDE(se), numeric(2))
rseq.de <- rseq.de["p",]
plotCorDE <- function(de, sig.sets)
{
cors <- vapply(ea.methods,
function(m) cor(de, sig.sets[,m], use="complete.obs"),
numeric(1))
o <- order(cors, decreasing=TRUE)
df <- reshape2::melt(sig.sets)
colnames(df) <- c("Dataset", "Method", "value")
df$xvalue <- de[as.vector(df$Dataset)]
df$Method <- substring(df$Method, 1, 7)
df$Method <- factor(df$Method,
levels=substring(colnames(sig.sets)[o],1,7))
col <- rev(get_palette(palette = "simpsons", 10))
ggline(df, x = "xvalue", y = "value", numeric.x.axis = TRUE,
linetype = "Method", color="Method",
palette = "simpsons", ylab="% significant sets", xlab="% DE genes")
}
plotCorDE(ma.de, ma.go.sig.sets)
## Warning in cor(de, sig.sets[, m], use = "complete.obs"): the standard
## deviation is zero
## Warning: Removed 2 rows containing missing values (geom_point).
cor.facetplot <- function(de, kegg.sig.sets, go.sig.sets)
{
cors <- vapply(ea.methods,
function(m) cor(de, kegg.sig.sets[,m], use="complete.obs"),
numeric(1))
o <- order(cors, decreasing=TRUE)
l <- list(KEGG=kegg.sig.sets, GO=go.sig.sets)
df <- reshape2::melt(l)
colnames(df)[c(1:2,4)] <- c("dataset", "method", "gsc")
df$xvalue <- de[as.vector(df$dataset)]
df$method <- substring(df$method, 1, 7)
df$method <- factor(df$method,
levels=substring(colnames(kegg.sig.sets)[o],1,7))
col <- rev(get_palette(palette = "simpsons", 10))
p <- ggline(df, x = "xvalue", y = "value", numeric.x.axis = TRUE,
linetype = "method", color="method",
palette = "simpsons", ylab="% significant sets", xlab="% DE genes")
p <- facet(p, facet.by="gsc")
return(p)
}
cor.facetplot(ma.de, ma.kegg.sig.sets, ma.go.sig.sets)
## Warning in cor(de, kegg.sig.sets[, m], use = "complete.obs"): the standard
## deviation is zero
## Warning: Removed 4 rows containing missing values (geom_point).
cor.facetplot(rseq.de, rseq.kegg.sig.sets, rseq.go.sig.sets)
## Warning in cor(de, kegg.sig.sets[, m], use = "complete.obs"): the standard
## deviation is zero
kegg.dir <- file.path(golub.dir, "kegg")
go.dir <- file.path(golub.dir, "go")
tyI <- evalTypeIError("globaltest", se, gs=kegg.gs, alpha=0.05, nperm=1000)
golgt.file <- file.path(kegg.dir, "globaltest.rds")
tyI.globaltest <- readRDS(golgt.file)
tyI.globaltest
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.006515 0.016287 0.050866 0.056189 0.732899
tyI.samgs <- readRDS(sub("globaltest", "samgs", golgt.file))
tyI.samgs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.003257 0.009772 0.050039 0.039088 0.833876
Using nominal DE p-values for ORA and EBM.
res <- evalTypeIError(ea.methods, se, gs=kegg.gs, save2file=TRUE)
readTyI <- function(res.dir)
{
res.files <- file.path(res.dir, paste0(c(sbeaMethods()[1:10],"camera_igcNA"), ".rds"))
res.files <- res.files[file.exists(res.files)]
res <- sapply(res.files, readRDS)
colnames(res) <- basename(colnames(res))
colnames(res) <- sub(".rds$", "", colnames(res))
return(res)
}
kegg.res <- readTyI(kegg.dir)
colnames(kegg.res)[11] <- sub("_igcNA$", "*", colnames(kegg.res)[11])
go.res <- readTyI(go.dir)
colnames(go.res)[11] <- sub("_igcNA$", "*", colnames(go.res)[11])
GSEABenchmarkeR:::plotTypeIError(kegg.res)
GSEABenchmarkeR:::plotTypeIError(go.res)
plotTypeIError2 <- function(data, ylabel=0.4)
{
data <- t(data)
rownames(data) <- substring(rownames(data), 1, 7)
data <- data[order(data[,"Max."] - data[,"Mean"]),]
df <- data.frame(methods=rownames(data), data)
colnames(df)[2:7] <- c("y0", "y25", "y50", "mean", "y75", "y100")
df[,1] <- factor(df[,1], levels=df[,1])
df.points <- data.frame(x=1:11, y=df$mean)
p <- ggplot() +
geom_boxplot(data=df, width = 0.8,
aes(x=df[,1], ymin=y0, lower=y25, middle=y50, upper=y75, ymax=y100),
stat="identity", fill="grey92") + theme_pubr()
#get_palette(palette = "simpsons", 10)
p <- ggpar(p, x.text.angle=45, legend="none") + xlab("") + ylab("type I error rate") +
geom_point(data=df.points, aes(x=x, y=y), color=cb.blue) +
geom_hline(yintercept=0.05, linetype="dashed", color = cb.red) +
geom_vline(xintercept=7.5, linetype="dashed", color = cb.darkgrey) +
geom_label(aes(x=4, y=ylabel), label="competitive", color = cb.darkgrey) +
geom_label(aes(x=9.5, y=ylabel), label="self-contained", color = cb.darkgrey)
return(p)
}
plotTypeIError2(kegg.res, ylabel=0.8)
plotTypeIError2(go.res)
TODO: recompute for ORA and EBM
random.dir <- file.path(golub.dir, "randomGS", "shuffled_labels")
ea.random.typeI <- readResults(random.dir, gs.grid,
methods=ea.methods, type="typeI")
ea.random.typeI
## $ora
## gs5 gs10 gs25 gs50 gs100 gs250 gs500
## Min. 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.0000000000
## 1st Qu. 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.0000000000
## Median 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.0000000000
## Mean 0.00012 0.00017 0.00037 0.00084 0.00115 0.00245 0.0002142857
## 3rd Qu. 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.0000000000
## Max. 0.03000 0.04000 0.05000 0.06000 0.08000 0.09000 0.0952380952
##
## $safe
## gs5 gs10 gs25 gs50 gs100 gs250 gs500
## Min. 0.00000 0.00000 0.0000 0.0000 0.01000 0.00000 0.00000000
## 1st Qu. 0.04000 0.03000 0.0300 0.0400 0.03000 0.03000 0.01851852
## Median 0.05000 0.05000 0.0500 0.0500 0.05000 0.05000 0.03703704
## Mean 0.04961 0.04874 0.0489 0.0492 0.04834 0.04814 0.04840741
## 3rd Qu. 0.06000 0.06000 0.0600 0.0600 0.06000 0.06000 0.07407407
## Max. 0.12000 0.11000 0.1200 0.1200 0.15000 0.14000 0.16666667
##
## $gsea
## gs5 gs10 gs25 gs50 gs100 gs250 gs500
## Min. 0.00000 0.00000 0.00000 0.00000 0.00000 0.000 0.00000000
## 1st Qu. 0.04000 0.04000 0.03000 0.03000 0.03000 0.020 0.00000000
## Median 0.05000 0.05000 0.05000 0.05000 0.04000 0.040 0.01818182
## Mean 0.04956 0.05336 0.04772 0.04876 0.04812 0.053 0.04500000
## 3rd Qu. 0.06000 0.07000 0.06000 0.06000 0.06750 0.070 0.05454545
## Max. 0.11000 0.13000 0.11000 0.13000 0.14000 0.360 0.41818182
##
## $gsa
## gs5 gs10 gs25 gs50 gs100 gs250 gs500
## Min. 0.01000 0.01000 0.00000 0.0000 0.0000 0.0000 0.00000000
## 1st Qu. 0.04000 0.03750 0.03000 0.0400 0.0400 0.0400 0.02173913
## Median 0.05000 0.05000 0.05000 0.0500 0.0500 0.0500 0.04347826
## Mean 0.04995 0.04905 0.04815 0.0543 0.0496 0.0493 0.04826087
## 3rd Qu. 0.06000 0.06000 0.06000 0.0700 0.0600 0.0600 0.06521739
## Max. 0.12000 0.11000 0.11000 0.1200 0.1000 0.1200 0.15217391
##
## $padog
## gs5 gs10 gs25 gs50 gs100 gs250 gs500
## Min. 0.01000 0.00000 0.02000 0.01000 0.01000 0.01000 0.00000000
## 1st Qu. 0.04000 0.04000 0.04000 0.04000 0.04000 0.04000 0.03448276
## Median 0.05000 0.05000 0.05000 0.05000 0.05000 0.05000 0.05172414
## Mean 0.05102 0.05062 0.04896 0.05064 0.05004 0.04896 0.04922414
## 3rd Qu. 0.06000 0.06000 0.06000 0.06000 0.06000 0.06000 0.06896552
## Max. 0.11000 0.11000 0.09000 0.09000 0.09000 0.09000 0.10344828
##
## $globaltest
## gs5 gs10 gs25 gs50 gs100 gs250 gs500
## Min. 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000000
## 1st Qu. 0.02000 0.02000 0.01000 0.00000 0.00000 0.00000 0.00000000
## Median 0.04000 0.04000 0.03000 0.02000 0.01000 0.00000 0.00000000
## Mean 0.04865 0.05001 0.05124 0.04479 0.05552 0.04443 0.05838889
## 3rd Qu. 0.07000 0.07000 0.06000 0.05000 0.04000 0.01000 0.00000000
## Max. 0.42000 0.56000 0.65000 0.67000 0.99000 1.00000 1.00000000
##
## $roast
## gs5 gs10 gs25 gs50 gs100 gs250 gs500
## Min. 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000000
## 1st Qu. 0.03000 0.03000 0.03000 0.02000 0.02000 0.01000 0.00000000
## Median 0.04000 0.04000 0.04000 0.04000 0.04000 0.02000 0.00000000
## Mean 0.04885 0.04826 0.04952 0.04811 0.04984 0.04779 0.05170370
## 3rd Qu. 0.06000 0.06000 0.06000 0.06000 0.06000 0.05000 0.03703704
## Max. 0.26000 0.20000 0.32000 0.32000 0.33000 0.63000 0.83333333
##
## $camera
## gs5 gs10 gs25 gs50 gs100 gs250 gs500
## Min. 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0
## 1st Qu. 0.03000 0.03000 0.02000 0.01000 0.00000 0.00000 0
## Median 0.04000 0.04000 0.03000 0.01000 0.00000 0.00000 0
## Mean 0.04444 0.04522 0.02775 0.01617 0.00576 0.00023 0
## 3rd Qu. 0.06000 0.06000 0.04000 0.02000 0.01000 0.00000 0
## Max. 0.12000 0.11000 0.11000 0.07000 0.04000 0.01000 0
##
## $gsva
## gs5 gs10 gs25 gs50 gs100 gs250 gs500
## Min. 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000000
## 1st Qu. 0.03000 0.03000 0.03000 0.03000 0.03000 0.03000 0.01851852
## Median 0.05000 0.05000 0.05000 0.05000 0.05000 0.05000 0.03703704
## Mean 0.05061 0.05017 0.05093 0.04956 0.05103 0.04923 0.05075926
## 3rd Qu. 0.07000 0.07000 0.07000 0.07000 0.07000 0.06000 0.07407407
## Max. 0.26000 0.23000 0.19000 0.16000 0.20000 0.18000 0.24074074
##
## $samgs
## gs5 gs10 gs25 gs50 gs100 gs250 gs500
## Min. 0.00000 0.00000 0.00000 0.0000 0.0000 0.00000 0.00000000
## 1st Qu. 0.02000 0.01000 0.00000 0.0000 0.0000 0.00000 0.00000000
## Median 0.04000 0.03000 0.02000 0.0000 0.0000 0.00000 0.00000000
## Mean 0.04897 0.05033 0.05198 0.0422 0.0557 0.04307 0.05575926
## 3rd Qu. 0.06000 0.06000 0.05000 0.0300 0.0100 0.00000 0.00000000
## Max. 0.52000 0.77000 0.92000 0.8600 1.0000 1.00000 1.00000000
mean.mat <- vapply(ea.random.typeI, function(x) x["Mean",], numeric(7))
med.mat <- vapply(ea.random.typeI, function(x) x["Median",], numeric(7))
GSEABenchmarkeR:::plotGSSizeRobustness(mean.mat, what="typeI")
GSEABenchmarkeR:::plotGSSizeRobustness(med.mat, what="typeI")
Adjusted p-values: flattens out signal for most methods, independent of gene set DB (GO/KEGG), number of permutations, and whether a built-in correction is used; recommendable for roast and gsva; does not make a difference for globaltest and samgs
camera (real gene sets) and gsa (real and random gene sets)globaltest and samgs accurately control the type I error rate, yet their sensitive null hypothesis (self-contained) seems to render them of limited practical relevanceFrom the globaltest vignette (which is a top 5% Bioc package):
Still, it is true that the null hypotheses that the global test tests is false even if only a single gene in the gene set is associated with the phenotype; especially smaller gene sets may therefore become significant as a result of only a single significant gene.
However, because the test is directed especially against the alternative that there are many associated genes, such examples are rare among larger gene sets.
Rare = >90% ??
data.dir <- system.file("extdata", package="GSEABenchmarkeR")
mala.kegg.file <- file.path(data.dir, "malacards", "KEGG.rds")
mala.go.file <- file.path(data.dir, "malacards", "GO_BP.rds")
mala.kegg <- readRDS(mala.kegg.file)
mala.go <- readRDS(mala.go.file)
vapply(mala.kegg, nrow, integer(1))
## ACC ALZ BLCA BRCA CESC CHOL CML COAD CRC DCM DLBC DMND ESCA GBM HNSC
## 9 57 65 142 22 33 56 28 161 23 52 99 90 99 72
## HUNT KICH KIRC KIRP LAML LES LGG LIHC LUAD LUSC MESO OV PAAD PARK PCPG
## 34 4 8 8 108 49 24 98 54 23 3 31 70 39 12
## PDCO PRAD READ SARC SKCM STAD TGCT THCA THYM UCEC UCS UVM
## 31 12 2 73 42 24 24 81 61 90 29 55
mala.kegg$ALZ
## DataFrame with 57 rows and 4 columns
## TITLE REL.SCORE MATCHED.GENES
## <character> <numeric> <integer>
## hsa05010 Alzheimers disease 84.12 28
## hsa04932 Non-alcoholic fatty liver disease (NAFLD) 84.12 7
## hsa04726 Serotonergic synapse 49.19 8
## hsa04728 Dopaminergic synapse 49.19 8
## hsa04713 Circadian entrainment 49.19 5
## ... ... ... ...
## hsa05310 Asthma 9.81 1
## hsa05416 Viral myocarditis 9.81 2
## hsa05330 Allograft rejection 9.81 1
## hsa05332 Graft-versus-host disease 9.81 2
## hsa05321 Inflammatory bowel disease (IBD) 9.81 2
## TOTAL.GENES
## <integer>
## hsa05010 177
## hsa04932 160
## hsa04726 115
## hsa04728 130
## hsa04713 98
## ... ...
## hsa05310 35
## hsa05416 64
## hsa05330 41
## hsa05332 45
## hsa05321 67
mala.kegg$BRCA
## DataFrame with 142 rows and 4 columns
## TITLE REL.SCORE MATCHED.GENES TOTAL.GENES
## <character> <numeric> <integer> <integer>
## hsa05210 Colorectal cancer 166.1 35 70
## hsa05213 Endometrial cancer 166.1 23 61
## hsa05221 Acute myeloid leukemia 166.1 16 60
## hsa05218 Melanoma 166.1 35 78
## hsa05215 Prostate cancer 166.1 40 95
## ... ... ... ... ...
## hsa05020 Prion diseases 13.23 6 43
## hsa05144 Malaria 11.34 6 55
## hsa05143 African trypanosomiasis 11.28 5 36
## hsa04720 Long-term potentiation 10.93 7 67
## hsa05134 Legionellosis 10.81 6 59
d2d.file <- file.path(data.dir, "malacards", "GseId2Disease.txt")
d2d.map <- readDataId2diseaseCodeMap(d2d.file)
head(d2d.map)
## GSE1145 GSE11906 GSE1297 GSE14762 GSE14924_CD4
## "DCM" "PDCO" "ALZ" "KIRC" "LAML"
## GSE14924_CD8
## "LAML"
d2d.tcga <- rseq.ids
names(d2d.tcga) <- rseq.ids
ma.kegg.ranks$ora$GSE1297
## DataFrame with 290 rows and 4 columns
## GENE.SET NR.GENES
## <character> <numeric>
## 1 hsa00190_Oxidative_phosphorylation 106
## 2 hsa05010_Alzheimer's_disease 148
## 3 hsa05012_Parkinson's_disease 115
## 4 hsa05016_Huntington's_disease 162
## 5 hsa04932_Non-alcoholic_fatty_liver_disease_(NAFLD) 135
## ... ... ...
## 286 hsa03040_Spliceosome 112
## 287 hsa04914_Progesterone-mediated_oocyte_maturation 76
## 288 hsa00232_Caffeine_metabolism 5
## 289 hsa00460_Cyanoamino_acid_metabolism 5
## 290 hsa00524_Butirosin_and_neomycin_biosynthesis 5
## NR.SIG.GENES PVAL
## <numeric> <numeric>
## 1 55 1.98e-12
## 2 67 2.6e-11
## 3 56 3.43e-11
## 4 70 1.28e-10
## 5 48 7.08e-05
## ... ... ...
## 286 12 0.999
## 287 5 1
## 288 0 1
## 289 0 1
## 290 0 1
obs.score <- evalRelevance(ma.kegg.ranks$ora$GSE1297, mala.kegg$ALZ)
obs.score
## [1] 802.6036
gs.names <- ma.kegg.ranks$ora$GSE1297$GENE.SET
gs.ids <- substring(gs.names, 1, 8)
opt.score <- compOpt(mala.kegg$ALZ, gs.ids)
opt.score
## [1] 1200.644
round(obs.score / opt.score * 100, digits=2)
## [1] 66.85
rand.scores <- compRand(mala.kegg$ALZ, gs.ids, perm=50)
summary(rand.scores)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 495.6 602.3 636.8 636.3 675.3 725.0
(sum(rand.scores >= obs.score) + 1) / 51
## [1] 0.01960784
ma.kegg.rel.sets <- evalRelevance(ma.kegg.ranks, mala.kegg, d2d.map)
ma.go.rel.sets <- evalRelevance(ma.go.ranks, mala.go, d2d.map)
bpPlot(ma.kegg.rel.sets, what="rel.sets")
rseq.kegg.rel.sets <- evalRelevance(rseq.kegg.ranks, mala.kegg, d2d.tcga)
rseq.go.rel.sets <- evalRelevance(rseq.go.ranks, mala.go, d2d.tcga)
facetplot(ma.kegg.rel.sets, ma.go.rel.sets, rseq.kegg.rel.sets,
rseq.go.rel.sets, ylab="% optimal relevance score", vline=4.5)